Source code for nlp_architect.data.cdc_resources.gen_scripts.create_word_embed_elmo_dump

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import argparse
import logging
import os
import pickle
from os.path import join

from nlp_architect.common.cdc.mention_data import MentionData
from nlp_architect.data.cdc_resources.embedding.embed_elmo import ElmoEmbedding
from nlp_architect.utils import io

logger = logging.getLogger(__name__)


def load_elmo_for_vocab(mentions):
    """
    Build the ELMo embedding cache for a collection of mentions.

    A fresh ``ElmoEmbedding`` is created and ``get_head_feature_vector`` is
    called once per mention; the embedding object's ``cache`` attribute is
    then returned (presumably filled as a side effect of those calls --
    confirm against ``ElmoEmbedding``).

    Args:
        mentions: iterable of mention objects accepted by
            ``ElmoEmbedding.get_head_feature_vector``

    Returns:
        The ``cache`` attribute of the ``ElmoEmbedding`` instance
    """
    embedder = ElmoEmbedding()
    for current_mention in mentions:
        # Return value is intentionally ignored; only the cache is used.
        embedder.get_head_feature_vector(current_mention)

    vocab_cache = embedder.cache
    logger.info('Total words/contexts in vocabulary %d', len(vocab_cache))
    return vocab_cache
def _collect_mention_files(mentions_path):
    """Return the mention file paths under *mentions_path* (a file or a directory tree)."""
    if not os.path.isdir(mentions_path):
        return [mentions_path]

    mention_files = []
    for dirpath, _, file_names in os.walk(mentions_path):
        for file_name in file_names:
            # Skip macOS Finder metadata files that may sit next to the data.
            if file_name == '.DS_Store':
                continue
            mention_files.append(join(dirpath, file_name))
    return mention_files


def elmo_dump(mentions_path=None, out_file=None):
    """
    Read mentions, compute their ELMo embedding cache and pickle it to disk.

    Args:
        mentions_path: a single mentions file, or a directory that is walked
            recursively for mentions files. When omitted, falls back to the
            module-level ``args.mentions`` set by the command-line entry point.
        out_file: path of the pickle file to write. When omitted, falls back
            to the module-level ``args.output``.

    Raises:
        NameError: if a parameter is omitted and the module-level ``args``
            does not exist (i.e. the module was imported rather than run as
            a script).
    """
    # The original implementation always read the script-only global `args`,
    # which made the function unusable when imported. Explicit parameters now
    # take precedence; the global is kept only as a backward-compatible default.
    if mentions_path is None:
        mentions_path = args.mentions
    if out_file is None:
        out_file = args.output

    mentions = []
    for mention_file in _collect_mention_files(mentions_path):
        mentions.extend(MentionData.read_mentions_json_to_mentions_data_list(mention_file))

    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)

    with open(out_file, 'wb') as f:
        pickle.dump(elmo_ecb_embeddings, f)

    logger.info('Saving dump to file-%s', out_file)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, check that the mentions
    # path exists, then build and save the ELMo dump.
    cli_parser = argparse.ArgumentParser(
        description='Create Elmo Embedding dataset only dump')
    cli_parser.add_argument(
        '--mentions', required=True, type=str, help='mentions_file file')
    cli_parser.add_argument(
        '--output', required=True, type=str,
        help='location were to create dump file')

    # `args` must remain a module-level name: elmo_dump() reads it as a global.
    args = cli_parser.parse_args()

    # --mentions may be either a directory or a single file; pick the
    # matching validator.
    validate_mentions_path = (
        io.validate_existing_directory
        if os.path.isdir(args.mentions)
        else io.validate_existing_filepath
    )
    validate_mentions_path(args.mentions)

    elmo_dump()
    print('Done!')